#import cds object creacted by Loyal
dat.cds<-readRDS("dat.cds.rds")

#subset the cds object by NPCs
dat.npc<-dat.cds[,pData(dat.cds)$cellType=="NPC"]
#Remove sample KS1_7 as it had a weird Karyotype
dat.npc<-dat.npc[,pData(dat.npc)$clone != "KS1-7"]
dat.cds<-dat.cds[,pData(dat.cds)$clone != "KS1-7"]

#Estimate size factors and dispersions
dat.npc<-estimateSizeFactors(dat.npc)
dat.npc<-estimateDispersions(dat.npc)
## Warning: Deprecated, use tibble::rownames_to_column() instead.
## Warning in log(ifelse(y == 0, 1, y/mu)): NaNs produced
## Warning: step size truncated due to divergence
## Removing 264 outliers
## Warning in log(ifelse(y == 0, 1, y/mu)): NaNs produced

## Warning in log(ifelse(y == 0, 1, y/mu)): step size truncated due to
## divergence
## Warning in log(ifelse(y == 0, 1, y/mu)): NaNs produced
## Warning: step size truncated due to divergence
## Warning: glm.fit: algorithm did not converge
## Warning in log(ifelse(y == 0, 1, y/mu)): NaNs produced
## Warning: step size truncated due to divergence
## Warning: glm.fit: algorithm did not converge
## Warning in log(ifelse(y == 0, 1, y/mu)): NaNs produced
## Warning: step size truncated due to divergence
## Warning in log(ifelse(y == 0, 1, y/mu)): NaNs produced
## Warning: step size truncated due to divergence
## Warning: glm.fit: algorithm did not converge
## Warning in log(ifelse(y == 0, 1, y/mu)): NaNs produced
## Warning: step size truncated due to divergence
## Warning: glm.fit: algorithm did not converge
## Warning in log(ifelse(y == 0, 1, y/mu)): NaNs produced
## Warning: step size truncated due to divergence
## Warning in log(ifelse(y == 0, 1, y/mu)): NaNs produced
## Warning: step size truncated due to divergence
## Warning: glm.fit: algorithm did not converge
## Warning in log(ifelse(y == 0, 1, y/mu)): NaNs produced
## Warning: step size truncated due to divergence
## Warning: glm.fit: algorithm did not converge
## Warning in log(ifelse(y == 0, 1, y/mu)): NaNs produced
## Warning: step size truncated due to divergence
## Warning in log(ifelse(y == 0, 1, y/mu)): NaNs produced
## Warning: step size truncated due to divergence
## Warning: glm.fit: algorithm did not converge
## Warning in log(ifelse(y == 0, 1, y/mu)): NaNs produced
## Warning: step size truncated due to divergence
## Warning: glm.fit: algorithm did not converge
## Warning in log(ifelse(y == 0, 1, y/mu)): NaNs produced
## Warning: step size truncated due to divergence
## Warning in log(ifelse(y == 0, 1, y/mu)): NaNs produced
## Warning: step size truncated due to divergence
## Warning: glm.fit: algorithm did not converge
## Warning in log(ifelse(y == 0, 1, y/mu)): NaNs produced
## Warning: step size truncated due to divergence
## Warning: glm.fit: algorithm did not converge
## Warning in log(ifelse(y == 0, 1, y/mu)): NaNs produced
## Warning: step size truncated due to divergence
## Warning in log(ifelse(y == 0, 1, y/mu)): NaNs produced
## Warning: step size truncated due to divergence
## Warning: glm.fit: algorithm did not converge
## Warning in parametricDispersionFit(disp_table[row.names(disp_table) %in% :
## Dispersion fit did not converge.
dat.cds<-estimateSizeFactors(dat.cds)
dat.cds<-estimateDispersions(dat.cds)
## Warning: Deprecated, use tibble::rownames_to_column() instead.
## Removing 183 outliers
#Detect genes and impose a cutoff for the number of cells expressing a gene (remember to ask Loyal why we are doing this and why he set it at 20)
dat.npc<-detectGenes(dat.npc)
cellCutoff<-20
expressed_genes_npc<-row.names(subset(fData(dat.npc),
    num_cells_expressed >= cellCutoff))

dat.cds<-detectGenes(dat.cds)
cellCutoff<-20
expressed_genes_cds<-row.names(subset(fData(dat.cds),
    num_cells_expressed >= cellCutoff))

# find high variance genes
disp_table_npc <- dispersionTable(dat.npc)
unsup_clustering_genes <- subset(disp_table_npc, mean_expression >= 0.0005 & dispersion_empirical >= 1*dispersion_fit)
dat.npc <- setOrderingFilter(dat.npc, unsup_clustering_genes$gene_id)
plot_ordering_genes(dat.npc)
## Warning: Transformation introduced infinite values in continuous y-axis

disp_table_cds <- dispersionTable(dat.cds)
unsup_clustering_genes_cds <- subset(disp_table_cds, mean_expression >= 0.0005 & dispersion_empirical >= 1*dispersion_fit)
dat.cds <- setOrderingFilter(dat.cds, unsup_clustering_genes_cds$gene_id)
plot_ordering_genes(dat.cds)
## Warning: Transformation introduced infinite values in continuous y-axis

#differential gene test for NPCs only
npc.KS_vs_WT<-differentialGeneTest(dat.npc[expressed_genes_npc,],
                                      fullModelFormulaStr = "~num_genes_expressed + genotype", reducedModelFormulaStr = "~num_genes_expressed",cores=4)

NPC.KS_vs_WT.sig_genes <- subset(npc.KS_vs_WT, qval < 0.000000000000000000000000000000001)
write.table(NPC.KS_vs_WT.sig_genes, "KS_vs_WT.sig_genes", sep = "\t")

#Subset dat.npc by differentially expressed genes
diff_npcs <- dat.npc[row.names(subset(fData(dat.npc),
              gene_short_name %in% NPC.KS_vs_WT.sig_genes$gene_short_name)),]
# quick QC for all cells
pData(dat.cds)$Total_mRNAs <- Matrix::colSums(exprs(dat.cds))

qplot(Total_mRNAs, data = pData(dat.cds), fill = clone, geom = "density", alpha = 0.1) + 
    facet_wrap("clone") + guides(fill = "none") + theme_bw()

# quick QC for NPCs
pData(dat.npc)$Total_mRNAs <- Matrix::colSums(exprs(dat.npc))

qplot(Total_mRNAs, data = pData(dat.npc), fill = clone, geom = "density", alpha = 0.1) + 
    facet_wrap("clone") + guides(fill = "none") + theme_bw()

# subset the diff_npcs object to seperate by genotype
diff_KS <- diff_npcs[, pData(diff_npcs)$genotype %in% "KS1"]
diff_WT <- diff_npcs[, pData(diff_npcs)$genotype %in% "WT"]

# calculate the mean_cpc for both groups
fData(diff_KS)$mean_cpc <- as.vector(apply(exprs(diff_KS), 1, mean))
fData(diff_WT)$mean_cpc <- as.vector(apply(exprs(diff_WT), 1, mean))

# make df that is genes by genotype containing mean_cpc
mean.genotype <- data.frame(row.names = rownames(fData(diff_KS)), KS = fData(diff_KS)$mean_cpc, 
    WT = fData(diff_WT)$mean_cpc)

# determine if the KS genes are up relative to WT genes
mean.genotype$KS.dir <- ifelse(mean.genotype$KS > mean.genotype$WT, "UP", "DOWN")

# subset by KS direction
upregulated.KS <- subset(mean.genotype, KS.dir == "UP")
downregulated.KS <- subset(mean.genotype, KS.dir == "DOWN")

# make a vector of the genes
UP.KS <- rownames(upregulated.KS)
DOWN.KS <- rownames(downregulated.KS)

# subset the original differentially expressed genes data frame using these
# lists
upregulated.KS <- subset(NPC.KS_vs_WT.sig_genes, rownames(NPC.KS_vs_WT.sig_genes) %in% 
    UP.KS)
write.table(upregulated.KS, "upregulated_KS.txt", sep = "\t")

downregulated.KS <- subset(NPC.KS_vs_WT.sig_genes, rownames(NPC.KS_vs_WT.sig_genes) %in% 
    DOWN.KS)
write.table(downregulated.KS, "downregulated_KS.txt", sep = "\t")
# load differential gene expression between conditions as determined by
# Moncole and generate a vector of differentially expressed short gene names
npc_diff_genes <- NPC.KS_vs_WT.sig_genes$gene_short_name
npc_tmp <- bitr(npc_diff_genes, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = "org.Hs.eg.db")
## 'select()' returned 1:1 mapping between keys and columns
## Warning in bitr(npc_diff_genes, fromType = "SYMBOL", toType = "ENTREZID", :
## 6.02% of input gene IDs are fail to map...
npc_diff_ids <- npc_tmp$ENTREZID

# Biological Process
npc_diff_BP <- enrichGO(npc_diff_ids, "org.Hs.eg.db", keytype = "ENTREZID", 
    ont = "BP", readable = TRUE)
barplot(npc_diff_BP, drop = TRUE, showCategory = 50)

dotplot(npc_diff_BP)

enrichMap(npc_diff_BP, n = 20, fixed = TRUE, vertex.label.font = 1, vertex.label.cex = 0.3)

write.table(as.data.frame(npc_diff_BP), "npc_diff_BP_summary.txt", sep = "\t")

## Cellular Compartment
npc_diff_CC <- enrichGO(npc_diff_ids, "org.Hs.eg.db", keytype = "ENTREZID", 
    ont = "CC", readable = TRUE)
barplot(npc_diff_CC, drop = TRUE, showCategory = 50)

dotplot(npc_diff_CC)

enrichMap(npc_diff_CC, n = 20, fixed = TRUE, vertex.label.font = 1, vertex.label.cex = 0.3)

write.table(as.data.frame(npc_diff_CC), "npc_diff_CC_summary.txt", sep = "\t")

# Molecular Function
npc_diff_MF <- enrichGO(npc_diff_ids, "org.Hs.eg.db", keytype = "ENTREZID", 
    ont = "MF", readable = TRUE)
barplot(npc_diff_MF, drop = TRUE, showCategory = 50)

dotplot(npc_diff_MF)

enrichMap(npc_diff_MF, n = 20, fixed = TRUE, vertex.label.font = 1, vertex.label.cex = 0.3)

write.table(as.data.frame(npc_diff_MF), "npc_diff_MF_summary.txt", sep = "\t")
# load differential gene expression between conditions as determined by
# Moncole and generate a vector of differentially expressed short gene names
npc_KS_UP_genes <- upregulated.KS$gene_short_name
npc_tmp <- bitr(npc_KS_UP_genes, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = "org.Hs.eg.db")
## 'select()' returned 1:1 mapping between keys and columns
## Warning in bitr(npc_KS_UP_genes, fromType = "SYMBOL", toType =
## "ENTREZID", : 8.5% of input gene IDs are fail to map...
npc_KS_UP_ids <- npc_tmp$ENTREZID

# Biological Process
npc_KS_UP_BP <- enrichGO(npc_KS_UP_ids, "org.Hs.eg.db", keytype = "ENTREZID", 
    ont = "BP", readable = TRUE)
barplot(npc_KS_UP_BP, drop = TRUE, showCategory = 50)

dotplot(npc_KS_UP_BP)

enrichMap(npc_KS_UP_BP, n = 20, fixed = TRUE, vertex.label.font = 1, vertex.label.cex = 0.3)

write.table(as.data.frame(npc_KS_UP_BP), "npc_UP_KS_BP_summary.txt", sep = "\t")

## Cellular Compartment
npc_KS_UP_CC <- enrichGO(npc_KS_UP_ids, "org.Hs.eg.db", keytype = "ENTREZID", 
    ont = "CC", readable = TRUE)
barplot(npc_KS_UP_CC, drop = TRUE, showCategory = 50)

dotplot(npc_KS_UP_CC, showCategory = 50)

enrichMap(npc_KS_UP_CC, n = 20, fixed = TRUE, vertex.label.font = 1, vertex.label.cex = 0.3)

write.table(as.data.frame(npc_KS_UP_CC), "npc_KS_UP_CC_summary.txt", sep = "\t")

# Molecular Function
npc_KS_UP_MF <- enrichGO(npc_KS_UP_ids, "org.Hs.eg.db", keytype = "ENTREZID", 
    ont = "MF", readable = TRUE)
barplot(npc_KS_UP_MF, drop = TRUE, showCategory = 50)

dotplot(npc_KS_UP_MF)

enrichMap(npc_KS_UP_MF, n = 20, fixed = TRUE, vertex.label.font = 1, vertex.label.cex = 0.3)

write.table(as.data.frame(npc_KS_UP_MF), "npc_KS_UP_MF_summary.txt", sep = "\t")

# Reactome
UP <- enrichPathway(gene = npc_KS_UP_ids, pvalueCutoff = 0.05, readable = T)
barplot(UP, showCategory = 50)

write.table(as.data.frame(UP), "KS_UP_PATHWAY.txt", sep = "\t")
# load differential gene expression between conditions as determined by
# Moncole and generate a vector of differentially expressed short gene names
npc_KS_DOWN_genes <- downregulated.KS$gene_short_name
npc_tmp <- bitr(npc_KS_DOWN_genes, fromType = "SYMBOL", toType = "ENTREZID", 
    OrgDb = "org.Hs.eg.db")
## 'select()' returned 1:1 mapping between keys and columns
## Warning in bitr(npc_KS_DOWN_genes, fromType = "SYMBOL", toType =
## "ENTREZID", : 2.68% of input gene IDs are fail to map...
npc_KS_DOWN_ids <- npc_tmp$ENTREZID

# Biological Process
npc_KS_DOWN_BP <- enrichGO(npc_KS_DOWN_ids, "org.Hs.eg.db", keytype = "ENTREZID", 
    ont = "BP", readable = TRUE)
barplot(npc_KS_DOWN_BP, drop = TRUE, showCategory = 50)

dotplot(npc_KS_DOWN_BP)

enrichMap(npc_KS_DOWN_BP, n = 20, fixed = TRUE, vertex.label.font = 1, vertex.label.cex = 0.3)

write.table(as.data.frame(npc_KS_DOWN_BP), "npc_DOWN_KS_BP_summary.txt", sep = "\t")

## Cellular Compartment
npc_KS_DOWN_CC <- enrichGO(npc_KS_DOWN_ids, "org.Hs.eg.db", keytype = "ENTREZID", 
    ont = "CC", readable = TRUE)
barplot(npc_KS_DOWN_CC, drop = TRUE, showCategory = 50)

dotplot(npc_KS_DOWN_CC, showCategory = 50)

enrichMap(npc_KS_DOWN_CC, n = 20, fixed = TRUE, vertex.label.font = 1, vertex.label.cex = 0.3)

write.table(as.data.frame(npc_KS_DOWN_CC), "npc_KS_DOWN_CC_summary.txt", sep = "\t")

# Molecular Function
npc_KS_DOWN_MF <- enrichGO(npc_KS_DOWN_ids, "org.Hs.eg.db", keytype = "ENTREZID", 
    ont = "MF", readable = TRUE)
barplot(npc_KS_DOWN_MF, drop = TRUE, showCategory = 50)

dotplot(npc_KS_DOWN_MF)

enrichMap(npc_KS_DOWN_MF, n = 20, fixed = TRUE, vertex.label.font = 1, vertex.label.cex = 0.3)

write.table(as.data.frame(npc_KS_DOWN_MF), "npc_KS_DOWN_MF_summary.txt", sep = "\t")

# Reactome
DOWN <- enrichPathway(gene = npc_KS_DOWN_ids, pvalueCutoff = 0.05, readable = T)
barplot(DOWN, showCategory = 50)

write.table(as.data.frame(DOWN), "KS_DOWN_PATHWAY.txt", sep = "\t")
# tSNE for NPCs
plot_pc_variance_explained(dat.npc, return_all = F)

dat.npc <- reduceDimension(dat.npc, max_components = 2, num_dim = 4, reduction_method = "tSNE", 
    verbose = T)
## Remove noise by PCA ...
## Reduce dimension by tSNE ...
dat.npc <- clusterCells(dat.npc, num_clusters = 2)
## Warning in if (method == "DDRTree") {: the condition has length > 1 and
## only the first element will be used
## Warning in if (method == "densityPeak") {: the condition has length > 1 and
## only the first element will be used
## Distance cutoff calculated to 5.25722
plot_cell_clusters(dat.npc, 1, 2, color = "clone", markers = c("PAX6"))

tsne.npc <- t(reducedDimA(dat.npc))
tSNE_cluster_npcs <- ggplot(pData(dat.npc)) + geom_point(aes(x = tsne.npc[, 
    1], y = tsne.npc[, 2], color = clone), alpha = 0.5) + theme_bw() + scale_color_brewer(palette = "Set1") + 
    labs(x = "tSNE 1", y = "tSNE 2", color = "Clone")

tSNE_cluster_npcs

# tSNE for all cells
plot_pc_variance_explained(dat.cds, return_all = F)

dat.cds <- reduceDimension(dat.cds, max_components = 2, num_dim = 4, reduction_method = "tSNE", 
    verbose = T)
## Remove noise by PCA ...
## Reduce dimension by tSNE ...
dat.cds <- clusterCells(dat.cds, num_clusters = 2)
## Warning in if (method == "DDRTree") {: the condition has length > 1 and
## only the first element will be used

## Warning in if (method == "DDRTree") {: the condition has length > 1 and
## only the first element will be used
## Distance cutoff calculated to 6.600245
plot_cell_clusters(dat.cds, 1, 2, color = "clone")

tsne.cds <- t(reducedDimA(dat.cds))
tSNE_cluster_cds <- ggplot(pData(dat.cds)) + geom_point(aes(x = tsne.cds[, 1], 
    y = tsne.cds[, 2], color = clone, shape = cellType), alpha = 0.5) + theme_bw() + 
    scale_color_brewer(palette = "Set1") + labs(x = "tSNE 1", y = "tSNE 2", 
    color = "Clone", shape = "Cell Type")

tSNE_cluster_cds